; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

; Unpredicated offset gather returning <8 x i16>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 0). Expected code is a single vldrb.s16
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_s16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 8, i32 0, i32 0)
  ret <8 x i16> %0
}

declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr, <8 x i16>, i32, i32, i32)

; Unpredicated offset gather returning <4 x i32>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 0). Expected code is a single vldrb.s32
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_s32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 8, i32 0, i32 0)
  ret <4 x i32> %0
}

declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)

; Unpredicated offset gather returning <16 x i8>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 0). Expected code is a single vldrb.u8
; addressing [r0, q0]. Note the expected suffix is .u8 even though the last
; operand is 0; the _u8 test in this file expects the same instruction.
define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_s8(ptr %base, <16 x i8> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0.v16i8(ptr %base, <16 x i8> %offset, i32 8, i32 0, i32 0)
  ret <16 x i8> %0
}

declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0.v16i8(ptr, <16 x i8>, i32, i32, i32)

; Unpredicated offset gather returning <8 x i16>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 1). With the last operand set to 1 the
; expected instruction is vldrb.u16 addressing [r0, q0].
define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_u16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 8, i32 0, i32 1)
  ret <8 x i16> %0
}

; Unpredicated offset gather returning <4 x i32>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 1). With the last operand set to 1 the
; expected instruction is vldrb.u32 addressing [r0, q0].
define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_u32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 8, i32 0, i32 1)
  ret <4 x i32> %0
}

; Unpredicated offset gather returning <16 x i8>; the intrinsic's trailing
; operands are (i32 8, i32 0, i32 1). Expected code is a single vldrb.u8
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_u8(ptr %base, <16 x i8> %offset) {
; CHECK-LABEL: test_vldrbq_gather_offset_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0.v16i8(ptr %base, <16 x i8> %offset, i32 8, i32 0, i32 1)
  ret <16 x i8> %0
}

; Predicated offset gather returning <8 x i16>. %p is zero-extended to i32 and
; turned into an <8 x i1> mask by @llvm.arm.mve.pred.i2v.v8i1; the gather's
; trailing operands are (i32 8, i32 0, i32 0). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.s16 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_s16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.s16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 8, i32 0, i32 0, <8 x i1> %1)
  ret <8 x i16> %2
}

declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)

declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr, <8 x i16>, i32, i32, i32, <8 x i1>)

; Predicated offset gather returning <4 x i32>. %p is zero-extended to i32 and
; turned into a <4 x i1> mask by @llvm.arm.mve.pred.i2v.v4i1; the gather's
; trailing operands are (i32 8, i32 0, i32 0). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.s32 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_s32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.s32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 8, i32 0, i32 0, <4 x i1> %1)
  ret <4 x i32> %2
}

declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)

declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr, <4 x i32>, i32, i32, i32, <4 x i1>)

; Predicated offset gather returning <16 x i8>. %p is zero-extended to i32 and
; turned into a <16 x i1> mask by @llvm.arm.mve.pred.i2v.v16i1; the gather's
; trailing operands are (i32 8, i32 0, i32 0). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.u8 addressing [r0, q0]; vmov q0, q1. The expected suffix is
; .u8 even though the last i32 operand is 0.
define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_s8(ptr %base, <16 x i8> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u8 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0.v16i8.v16i1(ptr %base, <16 x i8> %offset, i32 8, i32 0, i32 0, <16 x i1> %1)
  ret <16 x i8> %2
}

declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)

declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0.v16i8.v16i1(ptr, <16 x i8>, i32, i32, i32, <16 x i1>)

; Predicated offset gather returning <8 x i16>. %p is zero-extended to i32 and
; turned into an <8 x i1> mask by @llvm.arm.mve.pred.i2v.v8i1; the gather's
; trailing operands are (i32 8, i32 0, i32 1). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.u16 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_u16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 8, i32 0, i32 1, <8 x i1> %1)
  ret <8 x i16> %2
}

; Predicated offset gather returning <4 x i32>. %p is zero-extended to i32 and
; turned into a <4 x i1> mask by @llvm.arm.mve.pred.i2v.v4i1; the gather's
; trailing operands are (i32 8, i32 0, i32 1). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.u32 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_u32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 8, i32 0, i32 1, <4 x i1> %1)
  ret <4 x i32> %2
}

; Predicated offset gather returning <16 x i8>. %p is zero-extended to i32 and
; turned into a <16 x i1> mask by @llvm.arm.mve.pred.i2v.v16i1; the gather's
; trailing operands are (i32 8, i32 0, i32 1). Expected sequence: vmsr p0, r1;
; vpst; vldrbt.u8 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_u8(ptr %base, <16 x i8> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrbq_gather_offset_z_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrbt.u8 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0.v16i8.v16i1(ptr %base, <16 x i8> %offset, i32 8, i32 0, i32 1, <16 x i1> %1)
  ret <16 x i8> %2
}

; Unpredicated vector-base gather: passes the <2 x i64> %addr argument and the
; immediate 616 to the gather.base intrinsic. Expected code is a single
; vldrd.u64 addressing [q0, #616], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_s64(<2 x i64> %addr) {
; CHECK-LABEL: test_vldrdq_gather_base_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [q0, #616]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 616)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64>, i32)

; Unpredicated vector-base gather with a negative immediate: passes the
; <2 x i64> %addr argument and -336 to the gather.base intrinsic. Expected
; code is a single vldrd.u64 addressing [q0, #-336], then a vmov of q1 to q0.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_u64(<2 x i64> %addr) {
; CHECK-LABEL: test_vldrdq_gather_base_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [q0, #-336]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 -336)
  ret <2 x i64> %0
}

; Vector-base gather with writeback: loads a <2 x i64> from %addr, passes it
; with the immediate 576 to the gather.base.wb intrinsic, stores field 1 of
; the returned pair back to %addr and returns field 0. Expected sequence:
; vldrw.u32 q1, [r0]; vldrd.u64 q0, [q1, #576]!; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(ptr %addr) {
; CHECK-LABEL: test_vldrdq_gather_base_wb_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrd.u64 q0, [q1, #576]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 576)
  %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1
  store <2 x i64> %2, ptr %addr, align 8
  %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0
  ret <2 x i64> %3
}

declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64>, i32)

; Vector-base gather with writeback and a negative immediate: loads a
; <2 x i64> from %addr, passes it with -328 to the gather.base.wb intrinsic,
; stores field 1 of the returned pair back to %addr and returns field 0.
; Expected sequence: vldrw.u32 q1, [r0]; vldrd.u64 q0, [q1, #-328]!;
; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_u64(ptr %addr) {
; CHECK-LABEL: test_vldrdq_gather_base_wb_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrd.u64 q0, [q1, #-328]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 -328)
  %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1
  store <2 x i64> %2, ptr %addr, align 8
  %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0
  ret <2 x i64> %3
}

; Predicated vector-base gather with writeback: loads a <2 x i64> from %addr,
; builds a <2 x i1> mask from the zero-extended %p, and calls the
; gather.base.wb.predicated intrinsic with the immediate 664. Field 1 of the
; returned pair is stored back to %addr and field 0 is returned. Expected
; sequence: vmsr p0, r1; vldrw.u32 q1, [r0]; vpst; vldrdt.u64 q0, [q1, #664]!;
; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(ptr %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_base_wb_z_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q0, [q1, #664]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1)
  %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 664, <2 x i1> %2)
  %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1
  store <2 x i64> %4, ptr %addr, align 8
  %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0
  ret <2 x i64> %5
}

declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32)
declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i1>)

; Predicated vector-base gather with writeback: loads a <2 x i64> from %addr,
; builds a <2 x i1> mask from the zero-extended %p, and calls the
; gather.base.wb.predicated intrinsic with the immediate 656. Field 1 of the
; returned pair is stored back to %addr and field 0 is returned. Expected
; sequence: vmsr p0, r1; vldrw.u32 q1, [r0]; vpst; vldrdt.u64 q0, [q1, #656]!;
; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(ptr %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q0, [q1, #656]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1)
  %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 656, <2 x i1> %2)
  %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1
  store <2 x i64> %4, ptr %addr, align 8
  %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0
  ret <2 x i64> %5
}

; Predicated vector-base gather: builds a <2 x i1> mask from the zero-extended
; %p and calls the gather.base.predicated intrinsic with %addr and the
; immediate 888. Expected sequence: vmsr p0, r0; vpst;
; vldrdt.u64 q1, [q0, #888]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_s64(<2 x i64> %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_base_z_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [q0, #888]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 888, <2 x i1> %1)
  ret <2 x i64> %2
}

declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i1>)

; Predicated vector-base gather with a negative immediate: builds a <2 x i1>
; mask from the zero-extended %p and calls the gather.base.predicated
; intrinsic with %addr and -1000. Expected sequence: vmsr p0, r0; vpst;
; vldrdt.u64 q1, [q0, #-1000]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_u64(<2 x i64> %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_base_z_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [q0, #-1000]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 -1000, <2 x i1> %1)
  ret <2 x i64> %2
}

; Unpredicated offset gather returning <2 x i64>; the intrinsic's trailing
; operands are (i32 64, i32 0, i32 0). Expected code is a single vldrd.u64
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_s64(ptr %base, <2 x i64> %offset) {
; CHECK-LABEL: test_vldrdq_gather_offset_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0.v2i64(ptr %base, <2 x i64> %offset, i32 64, i32 0, i32 0)
  ret <2 x i64> %0
}

declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0.v2i64(ptr, <2 x i64>, i32, i32, i32)

; Unpredicated offset gather returning <2 x i64>; the intrinsic's trailing
; operands are (i32 64, i32 0, i32 1). Expected code is a single vldrd.u64
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_u64(ptr %base, <2 x i64> %offset) {
; CHECK-LABEL: test_vldrdq_gather_offset_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0.v2i64(ptr %base, <2 x i64> %offset, i32 64, i32 0, i32 1)
  ret <2 x i64> %0
}

; Predicated offset gather returning <2 x i64>. %p is zero-extended to i32 and
; turned into a <2 x i1> mask by @llvm.arm.mve.pred.i2v.v2i1; the gather's
; trailing operands are (i32 64, i32 0, i32 0). Expected sequence:
; vmsr p0, r1; vpst; vldrdt.u64 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(ptr %base, <2 x i64> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_offset_z_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v2i1(ptr %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <2 x i1> %1)
  ret <2 x i64> %2
}

declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v2i1(ptr, <2 x i64>, i32, i32, i32, <2 x i1>)

; Predicated offset gather returning <2 x i64>. %p is zero-extended to i32 and
; turned into a <2 x i1> mask by @llvm.arm.mve.pred.i2v.v2i1; the gather's
; trailing operands are (i32 64, i32 0, i32 1). Expected sequence:
; vmsr p0, r1; vpst; vldrdt.u64 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_u64(ptr %base, <2 x i64> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_offset_z_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v2i1(ptr %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <2 x i1> %1)
  ret <2 x i64> %2
}

; Unpredicated shifted-offset gather returning <2 x i64>; the intrinsic's
; trailing operands are (i32 64, i32 3, i32 0). With the middle operand set to
; 3 the expected instruction is vldrd.u64 addressing [r0, q0, uxtw #3].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_s64(ptr %base, <2 x i64> %offset) {
; CHECK-LABEL: test_vldrdq_gather_shifted_offset_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0.v2i64(ptr %base, <2 x i64> %offset, i32 64, i32 3, i32 0)
  ret <2 x i64> %0
}

; Unpredicated shifted-offset gather returning <2 x i64>; the intrinsic's
; trailing operands are (i32 64, i32 3, i32 1). With the middle operand set to
; 3 the expected instruction is vldrd.u64 addressing [r0, q0, uxtw #3].
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_u64(ptr %base, <2 x i64> %offset) {
; CHECK-LABEL: test_vldrdq_gather_shifted_offset_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrd.u64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0.v2i64(ptr %base, <2 x i64> %offset, i32 64, i32 3, i32 1)
  ret <2 x i64> %0
}

; Predicated shifted-offset gather returning <2 x i64>. %p is zero-extended to
; i32 and turned into a <2 x i1> mask; the gather's trailing operands are
; (i32 64, i32 3, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrdt.u64 addressing [r0, q0, uxtw #3]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_s64(ptr %base, <2 x i64> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v2i1(ptr %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <2 x i1> %1)
  ret <2 x i64> %2
}

; Predicated shifted-offset gather returning <2 x i64>. %p is zero-extended to
; i32 and turned into a <2 x i1> mask; the gather's trailing operands are
; (i32 64, i32 3, i32 1). Expected sequence: vmsr p0, r1; vpst;
; vldrdt.u64 addressing [r0, q0, uxtw #3]; vmov q0, q1.
define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_u64(ptr %base, <2 x i64> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrdt.u64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0.v2i64.v2i1(ptr %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <2 x i1> %1)
  ret <2 x i64> %2
}

; Unpredicated offset gather returning <8 x half> with <8 x i16> offsets; the
; intrinsic's trailing operands are (i32 16, i32 0, i32 0). Expected code is a
; single vldrh.u16 addressing [r0, q0], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_f16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_offset_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 0)
  ret <8 x half> %0
}

declare <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0.v8i16(ptr, <8 x i16>, i32, i32, i32)

; Unpredicated offset gather returning <8 x i16>; the intrinsic's trailing
; operands are (i32 16, i32 0, i32 0). Expected code is a single vldrh.u16
; addressing [r0, q0]; the expected suffix is .u16 even though the last
; operand is 0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_s16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 0)
  ret <8 x i16> %0
}


; Unpredicated offset gather returning <4 x i32>; the intrinsic's trailing
; operands are (i32 16, i32 0, i32 0). Expected code is a single vldrh.s32
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_s32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrhq_gather_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 16, i32 0, i32 0)
  ret <4 x i32> %0
}


; Unpredicated offset gather returning <8 x i16>; the intrinsic's trailing
; operands are (i32 16, i32 0, i32 1). Expected code is a single vldrh.u16
; addressing [r0, q0], then a vmov copying the result from q1 to q0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_u16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 1)
  ret <8 x i16> %0
}

; Unpredicated offset gather returning <4 x i32>; the intrinsic's trailing
; operands are (i32 16, i32 0, i32 1). With the last operand set to 1 the
; expected instruction is vldrh.u32 addressing [r0, q0].
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_u32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrhq_gather_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 16, i32 0, i32 1)
  ret <4 x i32> %0
}

; Predicated offset gather returning <8 x half> with <8 x i16> offsets. %p is
; zero-extended to i32 and turned into an <8 x i1> mask; the gather's trailing
; operands are (i32 16, i32 0, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_z_f16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_offset_z_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1)
  ret <8 x half> %2
}

declare <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0.v8i16.v8i1(ptr, <8 x i16>, i32, i32, i32, <8 x i1>)

; Predicated offset gather returning <8 x i16>. %p is zero-extended to i32 and
; turned into an <8 x i1> mask; the gather's trailing operands are
; (i32 16, i32 0, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0]; vmov q0, q1. The expected suffix is .u16
; even though the last i32 operand is 0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_s16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_offset_z_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1)
  ret <8 x i16> %2
}


; Predicated offset gather returning <4 x i32>. %p is zero-extended to i32 and
; turned into a <4 x i1> mask; the gather's trailing operands are
; (i32 16, i32 0, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.s32 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_s32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_offset_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.s32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 16, i32 0, i32 0, <4 x i1> %1)
  ret <4 x i32> %2
}


; Predicated offset gather returning <8 x i16>. %p is zero-extended to i32 and
; turned into an <8 x i1> mask; the gather's trailing operands are
; (i32 16, i32 0, i32 1). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_u16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_offset_z_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 0, i32 1, <8 x i1> %1)
  ret <8 x i16> %2
}

; Predicated offset gather returning <4 x i32>. %p is zero-extended to i32 and
; turned into a <4 x i1> mask; the gather's trailing operands are
; (i32 16, i32 0, i32 1). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u32 addressing [r0, q0]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_u32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_offset_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 16, i32 0, i32 1, <4 x i1> %1)
  ret <4 x i32> %2
}

; Unpredicated shifted-offset gather returning <8 x half>; the intrinsic's
; trailing operands are (i32 16, i32 1, i32 0). With the middle operand set to
; 1 the expected instruction is vldrh.u16 addressing [r0, q0, uxtw #1].
define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_f16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 0)
  ret <8 x half> %0
}

; Unpredicated shifted-offset gather returning <8 x i16>; the intrinsic's
; trailing operands are (i32 16, i32 1, i32 0). Expected code is a single
; vldrh.u16 addressing [r0, q0, uxtw #1], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_s16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 0)
  ret <8 x i16> %0
}

; Unpredicated shifted-offset gather returning <4 x i32>; the intrinsic's
; trailing operands are (i32 16, i32 1, i32 0). Expected code is a single
; vldrh.s32 addressing [r0, q0, uxtw #1], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_s32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.s32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 16, i32 1, i32 0)
  ret <4 x i32> %0
}

; Unpredicated shifted-offset gather returning <8 x i16>; the intrinsic's
; trailing operands are (i32 16, i32 1, i32 1). Expected code is a single
; vldrh.u16 addressing [r0, q0, uxtw #1], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_u16(ptr %base, <8 x i16> %offset) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0.v8i16(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 1)
  ret <8 x i16> %0
}

; Unpredicated shifted-offset gather returning <4 x i32>; the intrinsic's
; trailing operands are (i32 16, i32 1, i32 1). Expected code is a single
; vldrh.u32 addressing [r0, q0, uxtw #1], then a vmov copying q1 to q0.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_u32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 16, i32 1, i32 1)
  ret <4 x i32> %0
}

; Predicated shifted-offset gather returning <8 x half>. %p is zero-extended
; to i32 and turned into an <8 x i1> mask; the gather's trailing operands are
; (i32 16, i32 1, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0, uxtw #1]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_z_f16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1)
  ret <8 x half> %2
}

; Predicated shifted-offset gather returning <8 x i16>. %p is zero-extended to
; i32 and turned into an <8 x i1> mask; the gather's trailing operands are
; (i32 16, i32 1, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0, uxtw #1]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_s16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1)
  ret <8 x i16> %2
}

; Predicated shifted-offset gather returning <4 x i32>. %p is zero-extended to
; i32 and turned into a <4 x i1> mask; the gather's trailing operands are
; (i32 16, i32 1, i32 0). Expected sequence: vmsr p0, r1; vpst;
; vldrht.s32 addressing [r0, q0, uxtw #1]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_s32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.s32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 16, i32 1, i32 0, <4 x i1> %1)
  ret <4 x i32> %2
}

; Predicated shifted-offset gather returning <8 x i16>. %p is zero-extended to
; i32 and turned into an <8 x i1> mask; the gather's trailing operands are
; (i32 16, i32 1, i32 1). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u16 addressing [r0, q0, uxtw #1]; vmov q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_u16(ptr %base, <8 x i16> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0.v8i16.v8i1(ptr %base, <8 x i16> %offset, i32 16, i32 1, i32 1, <8 x i1> %1)
  ret <8 x i16> %2
}

; Predicated shifted-offset gather returning <4 x i32>. %p is zero-extended to
; i32 and turned into a <4 x i1> mask; the gather's trailing operands are
; (i32 16, i32 1, i32 1). Expected sequence: vmsr p0, r1; vpst;
; vldrht.u32 addressing [r0, q0, uxtw #1]; vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_u32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 16, i32 1, i32 1, <4 x i1> %1)
  ret <4 x i32> %2
}

; Unpredicated vector-base gather returning <4 x float>: passes the
; <4 x i32> %addr argument and the immediate 12 to the gather.base intrinsic.
; Expected code is a single vldrw.u32 addressing [q0, #12], then vmov q0, q1.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_f32(<4 x i32> %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [q0, #12]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> %addr, i32 12)
  ret <4 x float> %0
}

declare <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32>, i32)

; Unpredicated vector-base gather returning <4 x i32>: passes the <4 x i32>
; %addr argument and the immediate 400 to the gather.base intrinsic. Expected
; code is a single vldrw.u32 addressing [q0, #400], then vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_s32(<4 x i32> %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [q0, #400]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 400)
  ret <4 x i32> %0
}

declare <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32>, i32)

; Unpredicated vector-base gather returning <4 x i32>: passes the <4 x i32>
; %addr argument and the immediate 284 to the gather.base intrinsic. Expected
; code is a single vldrw.u32 addressing [q0, #284], then vmov q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_u32(<4 x i32> %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [q0, #284]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 284)
  ret <4 x i32> %0
}

; Vector-base gather with writeback returning <4 x float>: loads a <4 x i32>
; from %addr, passes it with the immediate -64 to the gather.base.wb
; intrinsic, stores field 1 of the returned pair back to %addr and returns
; field 0. Expected sequence: vldrw.u32 q1, [r0];
; vldrw.u32 q0, [q1, #-64]!; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(ptr %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1, #-64]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> %0, i32 -64)
  %2 = extractvalue { <4 x float>, <4 x i32> } %1, 1
  store <4 x i32> %2, ptr %addr, align 8
  %3 = extractvalue { <4 x float>, <4 x i32> } %1, 0
  ret <4 x float> %3
}

declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32>, i32)

; Vector-base gather with writeback returning <4 x i32>: loads a <4 x i32>
; from %addr, passes it with the immediate 80 to the gather.base.wb intrinsic,
; stores field 1 of the returned pair back to %addr and returns field 0.
; Expected sequence: vldrw.u32 q1, [r0]; vldrw.u32 q0, [q1, #80]!;
; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(ptr %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1, #80]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 80)
  %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1
  store <4 x i32> %2, ptr %addr, align 8
  %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0
  ret <4 x i32> %3
}

declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32>, i32)

; Vector-base gather with writeback returning <4 x i32>: loads a <4 x i32>
; from %addr, passes it with the immediate 480 to the gather.base.wb
; intrinsic, stores field 1 of the returned pair back to %addr and returns
; field 0. Expected sequence: vldrw.u32 q1, [r0];
; vldrw.u32 q0, [q1, #480]!; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_u32(ptr %addr) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1, #480]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 480)
  %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1
  store <4 x i32> %2, ptr %addr, align 8
  %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0
  ret <4 x i32> %3
}

; Predicated vector-base gather with writeback returning <4 x float>: loads a
; <4 x i32> from %addr, builds a <4 x i1> mask from the zero-extended %p, and
; calls the gather.base.wb.predicated intrinsic with the immediate -352.
; Field 1 of the returned pair is stored back to %addr and field 0 is
; returned. Expected sequence: vmsr p0, r1; vldrw.u32 q1, [r0]; vpst;
; vldrwt.u32 q0, [q1, #-352]!; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(ptr %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_z_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [q1, #-352]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %0, i32 -352, <4 x i1> %2)
  %4 = extractvalue { <4 x float>, <4 x i32> } %3, 1
  store <4 x i32> %4, ptr %addr, align 8
  %5 = extractvalue { <4 x float>, <4 x i32> } %3, 0
  ret <4 x float> %5
}

declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)

; Predicated vector-base gather with writeback returning <4 x i32>: loads a
; <4 x i32> from %addr, builds a <4 x i1> mask from the zero-extended %p, and
; calls the gather.base.wb.predicated intrinsic with the immediate 276.
; Field 1 of the returned pair is stored back to %addr and field 0 is
; returned. Expected sequence: vmsr p0, r1; vldrw.u32 q1, [r0]; vpst;
; vldrwt.u32 q0, [q1, #276]!; vstrw.32 q1, [r0].
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_s32(ptr %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [q1, #276]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 276, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1
  store <4 x i32> %4, ptr %addr, align 8
  %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0
  ret <4 x i32> %5
}

declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)

; Predicated gather-base load with writeback, <4 x i32> result, immediate 88.
; The predicate comes from %p via zext + pred.i2v. Field 1 of the intrinsic's
; result is stored back to %addr and field 0 is returned. The intrinsic call
; carries no signedness operand.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_u32(ptr %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_wb_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [q1, #88]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x i32>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 88, <4 x i1> %2)
  %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1
  store <4 x i32> %4, ptr %addr, align 8
  %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0
  ret <4 x i32> %5
}

; Predicated gather-base load (no writeback), <4 x float> result. %addr is
; passed directly as the address vector with immediate -300; the predicate is
; built from %p via zext + pred.i2v. The expected output loads into q1 and
; then moves the result into q0.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_z_f32(<4 x i32> %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_z_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [q0, #-300]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> %addr, i32 -300, <4 x i1> %1)
  ret <4 x float> %2
}

declare <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)

; Predicated gather-base load (no writeback), <4 x i32> result. %addr is the
; address vector with immediate 440; the predicate is built from %p via
; zext + pred.i2v.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_s32(<4 x i32> %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [q0, #440]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 440, <4 x i1> %1)
  ret <4 x i32> %2
}

declare <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)

; Predicated gather-base load (no writeback), <4 x i32> result. %addr is the
; address vector with immediate 300; the predicate is built from %p via
; zext + pred.i2v. The intrinsic call carries no signedness operand.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_u32(<4 x i32> %addr, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_base_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [q0, #300]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1)
  ret <4 x i32> %2
}

; Gather-offset load of <4 x float> addressed by %base and %offset; the
; expected output uses the unshifted form `[r0, q0]`.
; NOTE(review): trailing immediates (32, 0, 0) are presumably memory element
; size in bits, offset shift and unsigned flag — confirm in IntrinsicsARM.td.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_f32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_offset_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 0)
  ret <4 x float> %0
}

declare <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)

; Gather-offset load of <4 x i32> addressed by %base and %offset; the expected
; output uses the unshifted form `[r0, q0]`.
; NOTE(review): trailing immediates (32, 0, 0) are presumably memory element
; size in bits, offset shift and unsigned flag — confirm in IntrinsicsARM.td.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_s32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 0)
  ret <4 x i32> %0
}


; Gather-offset load of <4 x i32> addressed by %base and %offset, with the
; last immediate set to 1. The expected instruction is still `vldrw.u32` with
; unshifted `[r0, q0]` addressing.
; NOTE(review): the last immediate is presumably the unsigned flag — confirm
; in IntrinsicsARM.td.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_u32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 1)
  ret <4 x i32> %0
}

; Predicated gather-offset load of <4 x float>. The predicate is built from %p
; via zext + pred.i2v and passed as the final operand; immediates are
; (32, 0, 0). Expected output: a VPST block around `vldrwt.u32` with unshifted
; `[r0, q0]` addressing.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_z_f32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_offset_z_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1)
  ret <4 x float> %2
}

declare <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0.v4i32.v4i1(ptr, <4 x i32>, i32, i32, i32, <4 x i1>)

; Predicated gather-offset load of <4 x i32>. The predicate is built from %p
; via zext + pred.i2v and passed as the final operand; immediates are
; (32, 0, 0). Expected output: a VPST block around `vldrwt.u32` with unshifted
; `[r0, q0]` addressing.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_s32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_offset_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1)
  ret <4 x i32> %2
}


; Predicated gather-offset load of <4 x i32> with immediates (32, 0, 1); the
; predicate is built from %p via zext + pred.i2v. The expected instruction is
; still `vldrwt.u32` with unshifted `[r0, q0]` addressing.
; NOTE(review): the third immediate is presumably the unsigned flag — confirm.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_u32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_offset_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 0, i32 1, <4 x i1> %1)
  ret <4 x i32> %2
}

; Gather-offset load of <4 x float> with the second immediate set to 2; the
; expected output uses the shifted addressing form `[r0, q0, uxtw #2]`.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_f32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 0)
  ret <4 x float> %0
}

; Gather-offset load of <4 x i32> with immediates (32, 2, 0); the expected
; output uses the shifted addressing form `[r0, q0, uxtw #2]`.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_s32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 0)
  ret <4 x i32> %0
}

; Gather-offset load of <4 x i32> with immediates (32, 2, 1); the expected
; output is the same shifted `vldrw.u32 ... [r0, q0, uxtw #2]` form.
; NOTE(review): the last immediate is presumably the unsigned flag — confirm.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_u32(ptr %base, <4 x i32> %offset) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 1)
  ret <4 x i32> %0
}

; Predicated gather-offset load of <4 x float> with immediates (32, 2, 0); the
; predicate is built from %p via zext + pred.i2v. Expected output: a VPST
; block around `vldrwt.u32` using `[r0, q0, uxtw #2]`.
define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_z_f32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1)
  ret <4 x float> %2
}

; Predicated gather-offset load of <4 x i32> with immediates (32, 2, 0); the
; predicate is built from %p via zext + pred.i2v. Expected output: a VPST
; block around `vldrwt.u32` using `[r0, q0, uxtw #2]`.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_s32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1)
  ret <4 x i32> %2
}

; Predicated gather-offset load of <4 x i32> with immediates (32, 2, 1); the
; predicate is built from %p via zext + pred.i2v. The expected instruction is
; still `vldrwt.u32` using `[r0, q0, uxtw #2]`.
; NOTE(review): the third immediate is presumably the unsigned flag — confirm.
define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_u32(ptr %base, <4 x i32> %offset, i16 zeroext %p) {
; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0.v4i32.v4i1(ptr %base, <4 x i32> %offset, i32 32, i32 2, i32 1, <4 x i1> %1)
  ret <4 x i32> %2
}

; Predicated scatter-offset store of <8 x i16> %value addressed by %base and
; %offset, immediates (8, 0). The <8 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrbt.16`.
; NOTE(review): (8, 0) are presumably memory element size in bits and offset
; shift — confirm in IntrinsicsARM.td.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>)

; Predicated scatter-offset store of <4 x i32> %value addressed by %base and
; %offset, immediates (8, 0). The <4 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrbt.32`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)

; Predicated scatter-offset store of <16 x i8> %value addressed by %base and
; %offset, immediates (8, 0). The <16 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrbt.8`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s8(ptr %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.8 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v16i8.v16i8.v16i1(ptr %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v16i8.v16i8.v16i1(ptr, <16 x i8>, <16 x i8>, i32, i32, <16 x i1>)

; Predicated scatter-offset store of <8 x i16> %value, immediates (8, 0), with
; an <8 x i1> predicate built from %p. The store intrinsic has no signedness
; operand; the expected instruction is `vstrbt.16` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <4 x i32> %value, immediates (8, 0), with
; a <4 x i1> predicate built from %p. The store intrinsic has no signedness
; operand; the expected instruction is `vstrbt.32` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <16 x i8> %value, immediates (8, 0), with
; a <16 x i1> predicate built from %p. The store intrinsic has no signedness
; operand; the expected instruction is `vstrbt.8` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u8(ptr %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrbq_scatter_offset_p_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrbt.8 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v16i8.v16i8.v16i1(ptr %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1)
  ret void
}

; Unpredicated scatter-offset store of <8 x i16> %value addressed by %base and
; %offset, immediates (8, 0). Expected output: a single `vstrb.16`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr, <8 x i16>, <8 x i16>, i32, i32)

; Unpredicated scatter-offset store of <4 x i32> %value addressed by %base and
; %offset, immediates (8, 0). Expected output: a single `vstrb.32`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr, <4 x i32>, <4 x i32>, i32, i32)

; Unpredicated scatter-offset store of <16 x i8> %value addressed by %base and
; %offset, immediates (8, 0). Expected output: a single `vstrb.8`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s8(ptr %base, <16 x i8> %offset, <16 x i8> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v16i8.v16i8(ptr %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v16i8.v16i8(ptr, <16 x i8>, <16 x i8>, i32, i32)

; Unpredicated scatter-offset store of <8 x i16> %value, immediates (8, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrb.16`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0)
  ret void
}

; Unpredicated scatter-offset store of <4 x i32> %value, immediates (8, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrb.32`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0)
  ret void
}

; Unpredicated scatter-offset store of <16 x i8> %value, immediates (8, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrb.8`.
define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u8(ptr %base, <16 x i8> %offset, <16 x i8> %value) {
; CHECK-LABEL: test_vstrbq_scatter_offset_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v16i8.v16i8(ptr %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0)
  ret void
}

; Predicated scatter-base store of <2 x i64> %value using %addr as the address
; vector with immediate 888. The <2 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrdt.64`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_base_p_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [q0, #888]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <2 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>)

; Predicated scatter-base store of <2 x i64> %value using %addr as the address
; vector with immediate 264. The <2 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrdt.64`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_u64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_base_p_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [q0, #264]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v2i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <2 x i1> %1)
  ret void
}

; Unpredicated scatter-base store of <2 x i64> %value using %addr as the
; address vector with immediate 408. Expected output: a single `vstrd.64`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_s64(<2 x i64> %addr, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_base_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [q0, #408]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 408, <2 x i64> %value)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64>, i32, <2 x i64>)

; Unpredicated scatter-base store of <2 x i64> %value using %addr as the
; address vector with a negative immediate (-472). Expected output: a single
; `vstrd.64` with `#-472`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_u64(<2 x i64> %addr, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_base_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [q0, #-472]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 -472, <2 x i64> %value)
  ret void
}

; Predicated scatter-base store with writeback. The <2 x i64> vector loaded
; from %addr is the address operand (immediate 248), %value is the data, and
; the <2 x i1> predicate is built from %p. The vector returned by the
; intrinsic is stored back to %addr; the expected output uses `[q1, #248]!`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(ptr %addr, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q0, [q1, #248]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1)
  %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 248, <2 x i64> %value, <2 x i1> %2)
  store <2 x i64> %3, ptr %addr, align 8
  ret void
}

declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64>, i32, <2 x i64>, <2 x i1>)

; Predicated scatter-base store with writeback, immediate 136. The vector
; loaded from %addr is the address operand, %value is the data, and the
; <2 x i1> predicate is built from %p. The vector returned by the intrinsic is
; stored back to %addr.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_u64(ptr %addr, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q0, [q1, #136]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = zext i16 %p to i32
  %2 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %1)
  %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v2i1(<2 x i64> %0, i32 136, <2 x i64> %value, <2 x i1> %2)
  store <2 x i64> %3, ptr %addr, align 8
  ret void
}

; Unpredicated scatter-base store with writeback, immediate 208. The vector
; loaded from %addr is the address operand and %value is the data; the vector
; returned by the intrinsic is stored back to %addr.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_s64(ptr %addr, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_base_wb_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrd.64 q0, [q1, #208]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 208, <2 x i64> %value)
  store <2 x i64> %1, ptr %addr, align 8
  ret void
}

declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64>, i32, <2 x i64>)

; Unpredicated scatter-base store with writeback and a negative immediate
; (-168). The vector loaded from %addr is the address operand and %value is
; the data; the vector returned by the intrinsic is stored back to %addr.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_u64(ptr %addr, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_base_wb_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrd.64 q0, [q1, #-168]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <2 x i64>, ptr %addr, align 8
  %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 -168, <2 x i64> %value)
  store <2 x i64> %1, ptr %addr, align 8
  ret void
}

; Predicated scatter-offset store of <2 x i64> %value addressed by %base and
; %offset, immediates (64, 0). The <2 x i1> predicate is built from %p via
; zext + pred.i2v. Expected output: a VPST block around `vstrdt.64` with
; unshifted `[r0, q0]` addressing.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_offset_p_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v2i1(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <2 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v2i1(ptr, <2 x i64>, <2 x i64>, i32, i32, <2 x i1>)

; Predicated scatter-offset store of <2 x i64> %value, immediates (64, 0),
; with a <2 x i1> predicate built from %p. The store intrinsic has no
; signedness operand; expected output is `vstrdt.64` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_u64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_offset_p_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v2i1(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <2 x i1> %1)
  ret void
}

; Unpredicated scatter-offset store of <2 x i64> %value addressed by %base and
; %offset, immediates (64, 0). Expected output: a single `vstrd.64` with
; unshifted `[r0, q0]` addressing.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_s64(ptr %base, <2 x i64> %offset, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_offset_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v2i64.v2i64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v2i64.v2i64(ptr, <2 x i64>, <2 x i64>, i32, i32)

; Unpredicated scatter-offset store of <2 x i64> %value, immediates (64, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrd.64` with unshifted `[r0, q0]` addressing.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_u64(ptr %base, <2 x i64> %offset, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_offset_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v2i64.v2i64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
  ret void
}

; Predicated scatter-offset store of <2 x i64> %value with immediates (64, 3);
; the <2 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrdt.64` inside a VPST block using `[r0, q0, uxtw #3]`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_s64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v2i1(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <2 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <2 x i64> %value with immediates (64, 3)
; and a <2 x i1> predicate built from %p. The store intrinsic has no
; signedness operand; expected output uses `[r0, q0, uxtw #3]`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_u64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrdt.64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v2i64.v2i64.v2i1(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <2 x i1> %1)
  ret void
}

; Unpredicated scatter-offset store of <2 x i64> %value with immediates
; (64, 3). Expected output: a single `vstrd.64` using `[r0, q0, uxtw #3]`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_s64(ptr %base, <2 x i64> %offset, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_s64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v2i64.v2i64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
  ret void
}

; Unpredicated scatter-offset store of <2 x i64> %value with immediates
; (64, 3). The store intrinsic has no signedness operand; expected output is
; a single `vstrd.64` using `[r0, q0, uxtw #3]`.
define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_u64(ptr %base, <2 x i64> %offset, <2 x i64> %value) {
; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_u64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrd.64 q1, [r0, q0, uxtw #3]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v2i64.v2i64(ptr %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
  ret void
}

; Unpredicated scatter-offset store of <8 x half> %value addressed by %base
; and the <8 x i16> %offset, immediates (16, 0). Expected output: a single
; `vstrh.16` with unshifted `[r0, q0]` addressing.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_f16(ptr %base, <8 x i16> %offset, <8 x half> %value) {
; CHECK-LABEL: test_vstrhq_scatter_offset_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8f16(ptr %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8f16(ptr, <8 x i16>, <8 x half>, i32, i32)

; Predicated scatter-offset store of <8 x half> %value, immediates (16, 0).
; The <8 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrht.16` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_f16(ptr %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_offset_p_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8f16.v8i1(ptr %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0, <8 x i1> %1)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8f16.v8i1(ptr, <8 x i16>, <8 x half>, i32, i32, <8 x i1>)

; Predicated scatter-offset store of <8 x i16> %value, immediates (16, 0).
; The <8 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrht.16` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_offset_p_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
  ret void
}


; Predicated scatter-offset store of <4 x i32> %value with immediates (16, 0).
; The <4 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrht.32` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_offset_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
  ret void
}


; Predicated scatter-offset store of <8 x i16> %value, immediates (16, 0),
; with an <8 x i1> predicate built from %p. The store intrinsic has no
; signedness operand; expected output is `vstrht.16` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_offset_p_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <4 x i32> %value, immediates (16, 0),
; with a <4 x i1> predicate built from %p. The store intrinsic has no
; signedness operand; expected output is `vstrht.32` inside a VPST block.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_offset_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
  ret void
}

; Unpredicated scatter-offset store of <8 x i16> %value, immediates (16, 0).
; Expected output: a single `vstrh.16` with unshifted `[r0, q0]` addressing.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrhq_scatter_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
  ret void
}


; Unpredicated scatter-offset store of <4 x i32> %value with immediates
; (16, 0). Expected output: a single `vstrh.32` with unshifted `[r0, q0]`
; addressing.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrhq_scatter_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
  ret void
}


; Unpredicated scatter-offset store of <8 x i16> %value, immediates (16, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrh.16`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrhq_scatter_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
  ret void
}

; Unpredicated scatter-offset store of <4 x i32> %value, immediates (16, 0).
; The store intrinsic has no signedness operand; expected output is a single
; `vstrh.32`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrhq_scatter_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
  ret void
}

; Unpredicated scatter-offset store of <8 x half> %value with immediates
; (16, 1). Expected output: a single `vstrh.16` using the shifted addressing
; form `[r0, q0, uxtw #1]`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_f16(ptr %base, <8 x i16> %offset, <8 x half> %value) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8f16(ptr %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1)
  ret void
}

; Predicated scatter-offset store of <8 x half> %value with immediates
; (16, 1); the <8 x i1> predicate is built from %p via zext + pred.i2v.
; Expected output: `vstrht.16` inside a VPST block using `[r0, q0, uxtw #1]`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_f16(ptr %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8f16.v8i1(ptr %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1, <8 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <8 x i16> %value with immediates (16, 1);
; the <8 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrht.16` inside a VPST block using `[r0, q0, uxtw #1]`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
  ret void
}

; Predicated scatter-offset store of <4 x i32> %value with immediates (16, 1);
; the <4 x i1> predicate is built from %p via zext + pred.i2v. Expected
; output: `vstrht.32` inside a VPST block using `[r0, q0, uxtw #1]`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
  ret void
}

; Predicated scatter store of an <8 x i16> vector with trailing operands
; (i32 16, i32 1). Signedness is not visible in the IR; the value type is
; plain <8 x i16>. The i16 argument %p becomes an <8 x i1> mask via zext and
; pred.i2v. The expected assembly is vmsr/vpst followed by `vstrht.16` with
; `uxtw #1`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v8i16.v8i16.v8i1(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %mask)
  ret void
}

; Predicated scatter store of a <4 x i32> vector with trailing operands
; (i32 16, i32 1). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The i16 argument %p becomes a <4 x i1> mask via zext and
; pred.i2v. The expected assembly is vmsr/vpst followed by `vstrht.32` with
; `uxtw #1`.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %mask)
  ret void
}

; Unpredicated scatter store of an <8 x i16> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 16, i32 1).
; The expected assembly is a single `vstrh.16` with `uxtw #1` on the offset
; register, lining up with the final i32 1 operand.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
  ret void
}

; Unpredicated scatter store of a <4 x i32> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 16, i32 1).
; The expected assembly is a single `vstrh.32` with `uxtw #1` on the offset
; register, lining up with the final i32 1 operand.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
  ret void
}

; Unpredicated scatter store of an <8 x i16> vector with trailing operands
; (i32 16, i32 1). Signedness is not visible in the IR; the value type is
; plain <8 x i16>. The expected assembly is a single `vstrh.16` with
; `uxtw #1` on the offset register.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u16(ptr %base, <8 x i16> %offset, <8 x i16> %value) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v8i16.v8i16(ptr %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
  ret void
}

; Unpredicated scatter store of a <4 x i32> vector with trailing operands
; (i32 16, i32 1). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The expected assembly is a single `vstrh.32` with
; `uxtw #1` on the offset register.
define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrh.32 q1, [r0, q0, uxtw #1]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
  ret void
}

; Unpredicated scatter-base store of a <4 x float> vector. %addr supplies the
; <4 x i32> base operand and the immediate 380 shows up as the `#380` offset
; in the expected `vstrw.32 q1, [q0, #380]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_f32(<4 x i32> %addr, <4 x float> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [q0, #380]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> %addr, i32 380, <4 x float> %value)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32>, i32, <4 x float>)

; Predicated scatter-base store of a <4 x float> vector with a negative
; immediate (-400). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q1, [q0, #-400]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_f32(<4 x i32> %addr, <4 x float> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_p_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [q0, #-400]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> %addr, i32 -400, <4 x float> %value, <4 x i1> %mask)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)

; Predicated scatter-base store of a <4 x i32> vector with immediate 48.
; The i16 argument %p is widened to i32 and converted to a <4 x i1> mask.
; The expected assembly is vmsr/vpst followed by `vstrwt.32 q1, [q0, #48]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_s32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [q0, #48]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 48, <4 x i32> %value, <4 x i1> %mask)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)

; Predicated scatter-base store of a <4 x i32> vector with a negative
; immediate (-376). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q1, [q0, #-376]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_u32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [q0, #-376]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 -376, <4 x i32> %value, <4 x i1> %mask)
  ret void
}

; Unpredicated scatter-base store of a <4 x i32> vector. %addr supplies the
; <4 x i32> base operand and the immediate 156 shows up as the `#156` offset
; in the expected `vstrw.32 q1, [q0, #156]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_s32(<4 x i32> %addr, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [q0, #156]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 156, <4 x i32> %value)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)

; Unpredicated scatter-base store of a <4 x i32> vector. %addr supplies the
; <4 x i32> base operand and the immediate 212 shows up as the `#212` offset
; in the expected `vstrw.32 q1, [q0, #212]`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_u32(<4 x i32> %addr, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [q0, #212]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 212, <4 x i32> %value)
  ret void
}

; Writeback scatter-base store of a <4 x float> vector. A <4 x i32> base
; vector is loaded from %addr, the wb intrinsic is called with immediate -412,
; and the <4 x i32> it returns is stored back to %addr. The expected assembly
; uses the `!` writeback form `vstrw.32 q0, [q1, #-412]!` between a vector
; load and a vector store through r0.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_f32(ptr %addr, <4 x float> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [q1, #-412]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> %bases, i32 -412, <4 x float> %value)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32>, i32, <4 x float>)

; Predicated writeback scatter-base store of a <4 x float> vector with
; immediate 236. The base vector is loaded from %addr, %p is widened and
; converted to a <4 x i1> mask, and the <4 x i32> returned by the wb intrinsic
; is stored back to %addr. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q0, [q1, #236]!`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_f32(ptr %addr, <4 x float> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [q1, #236]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> %bases, i32 236, <4 x float> %value, <4 x i1> %mask)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)

; Predicated writeback scatter-base store of a <4 x i32> vector with
; immediate 328. The base vector is loaded from %addr, %p is widened and
; converted to a <4 x i1> mask, and the <4 x i32> returned by the wb intrinsic
; is stored back to %addr. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q0, [q1, #328]!`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_s32(ptr %addr, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [q1, #328]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %bases, i32 328, <4 x i32> %value, <4 x i1> %mask)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)

; Predicated writeback scatter-base store of a <4 x i32> vector with
; immediate 412. The base vector is loaded from %addr, %p is widened and
; converted to a <4 x i1> mask, and the <4 x i32> returned by the wb intrinsic
; is stored back to %addr. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q0, [q1, #412]!`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_u32(ptr %addr, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [q1, #412]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %bases, i32 412, <4 x i32> %value, <4 x i1> %mask)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

; Writeback scatter-base store of a <4 x i32> vector. A <4 x i32> base vector
; is loaded from %addr, the wb intrinsic is called with immediate -152, and
; the <4 x i32> it returns is stored back to %addr. The expected assembly uses
; the `!` writeback form `vstrw.32 q0, [q1, #-152]!`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_s32(ptr %addr, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [q1, #-152]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %bases, i32 -152, <4 x i32> %value)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)

; Writeback scatter-base store of a <4 x i32> vector. A <4 x i32> base vector
; is loaded from %addr, the wb intrinsic is called with immediate 64, and the
; <4 x i32> it returns is stored back to %addr. The expected assembly uses the
; `!` writeback form `vstrw.32 q0, [q1, #64]!`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_u32(ptr %addr, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_base_wb_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [q1, #64]!
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    bx lr
entry:
  %bases = load <4 x i32>, ptr %addr, align 8
  %bases.wb = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %bases, i32 64, <4 x i32> %value)
  store <4 x i32> %bases.wb, ptr %addr, align 8
  ret void
}

; Unpredicated scatter store of a <4 x float> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 32, i32 0).
; The expected assembly is a single `vstrw.32 q1, [r0, q0]` with no `uxtw`
; shift on the offset register, consistent with the final i32 0 operand.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_f32(ptr %base, <4 x i32> %offset, <4 x float> %value) {
; CHECK-LABEL: test_vstrwq_scatter_offset_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4f32(ptr %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4f32(ptr, <4 x i32>, <4 x float>, i32, i32)

; Predicated scatter store of a <4 x float> vector with trailing operands
; (i32 32, i32 0). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask that gates the store. The expected assembly is vmsr/vpst
; followed by `vstrwt.32 q1, [r0, q0]` with no `uxtw` shift.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_f32(ptr %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_offset_p_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4f32.v4i1(ptr %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0, <4 x i1> %mask)
  ret void
}

declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4f32.v4i1(ptr, <4 x i32>, <4 x float>, i32, i32, <4 x i1>)

; Predicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 0). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask that gates the store. The expected assembly is vmsr/vpst
; followed by `vstrwt.32 q1, [r0, q0]` with no `uxtw` shift.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_offset_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %mask)
  ret void
}


; Predicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 0). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The i16 argument %p becomes a <4 x i1> mask via zext and
; pred.i2v. The expected assembly is vmsr/vpst followed by
; `vstrwt.32 q1, [r0, q0]` with no `uxtw` shift.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_offset_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %mask)
  ret void
}

; Unpredicated scatter store of a <4 x i32> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 32, i32 0).
; The expected assembly is a single `vstrw.32 q1, [r0, q0]` with no `uxtw`
; shift on the offset register, consistent with the final i32 0 operand.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
  ret void
}


; Unpredicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 0). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The expected assembly is a single `vstrw.32 q1, [r0, q0]`
; with no `uxtw` shift on the offset register.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
  ret void
}

; Unpredicated scatter store of a <4 x float> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 32, i32 2).
; The expected assembly is a single `vstrw.32` whose offset register carries
; `uxtw #2`, which lines up with the final i32 2 operand (presumably the shift
; amount; the i32 32 is presumably the memory access width in bits).
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_f32(ptr %base, <4 x i32> %offset, <4 x float> %value) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4f32(ptr %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2)
  ret void
}

; Predicated scatter store of a <4 x float> vector with trailing operands
; (i32 32, i32 2). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask that gates the store. The expected assembly is vmsr/vpst
; followed by `vstrwt.32` with `uxtw #2`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_f32(ptr %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4f32.v4i1(ptr %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2, <4 x i1> %mask)
  ret void
}

; Predicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 2). The i16 argument %p is widened to i32 and converted to a
; <4 x i1> mask that gates the store. The expected assembly is vmsr/vpst
; followed by `vstrwt.32` with `uxtw #2`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %mask)
  ret void
}

; Predicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 2). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The i16 argument %p becomes a <4 x i1> mask via zext and
; pred.i2v. The expected assembly is vmsr/vpst followed by `vstrwt.32` with
; `uxtw #2`.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  %p.wide = zext i16 %p to i32
  %mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %p.wide)
  call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0.v4i32.v4i32.v4i1(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %mask)
  ret void
}

; Unpredicated scatter store of a <4 x i32> vector through
; llvm.arm.mve.vstr.scatter.offset with trailing operands (i32 32, i32 2).
; The expected assembly is a single `vstrw.32` with `uxtw #2` on the offset
; register, lining up with the final i32 2 operand.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_s32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
  ret void
}

; Unpredicated scatter store of a <4 x i32> vector with trailing operands
; (i32 32, i32 2). Signedness is not visible in the IR; the value type is
; plain <4 x i32>. The expected assembly is a single `vstrw.32` with
; `uxtw #2` on the offset register.
define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_u32(ptr %base, <4 x i32> %offset, <4 x i32> %value) {
; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vstrw.32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT:    bx lr
entry:
  call void @llvm.arm.mve.vstr.scatter.offset.p0.v4i32.v4i32(ptr %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
  ret void
}
